import pandas as pd
import numpy as np
from datetime import datetime
import calendar
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.graphics.tsaplots as tsa
import statsmodels.stats.diagnostic as diag
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, kpss
from scipy import signal
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima.arima import ADFTest
from pmdarima.arima import auto_arima
from gluonts.dataset.common import ListDataset
from gluonts.model.deepar import DeepAREstimator
from gluonts.mx.trainer import Trainer
from gluonts.evaluation import make_evaluation_predictions, Evaluator
from gluonts.dataset.util import to_pandas
import fbprophet
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
plt.rcParams["figure.figsize"] = (12,6)
df = pd.read_csv("hotel_bookings.csv")
df.head(5)
df.shape
df.isna().sum()
df.info()
df["children"].fillna(0, inplace = True)
df["children"] = df["children"].astype(int)
df = df.rename(columns = {"arrival_date_year" : "year", "arrival_date_day_of_month" : "day"})
df["month"] = df["arrival_date_month"].apply(lambda x : datetime.strptime((x[0:3]), "%b").month)
df["datetime"] = pd.to_datetime(df[["year", "month", "day"]])
filter = (df["adults"] == 0) & (df["children"] == 0) & (df["babies"] == 0)
df = df[~filter]
df.shape
df["reservation_status_date"] = pd.to_datetime(df["reservation_status_date"])
df["Total Guests"] = df["adults"] + df["children"] + df["babies"]
df["Total stay"] = df["stays_in_week_nights"] + df["stays_in_weekend_nights"]
df[df["adults"] == 0][["adults", "children", "babies"]]
data = df.copy()
data = data.set_index("datetime")
data = data.sort_index()
data.shape
rh = data[data["hotel"] == "Resort Hotel"]
ch = data[data["hotel"] == "City Hotel"]
rh_canceled = rh[rh["is_canceled"] == 1]
rh_checked = rh[rh["is_canceled"] == 0]
ch_canceled = ch[ch["is_canceled"] == 1]
ch_checked = ch[ch["is_canceled"] == 0]
rh_ts = rh_checked[["Total Guests"]]
rh_ts = rh_ts.groupby(rh_ts.index).sum()
ch_ts = ch_checked[["Total Guests"]]
ch_ts = ch_ts.groupby(ch_ts.index).sum()
rh_ts_weekly = rh_ts.resample("W").sum()
ch_ts_weekly = ch_ts.resample("W").sum()
rh_num = pd.DataFrame(rh.groupby(rh.index).size(), columns = ["Number_of_Bookings"])
ch_num = pd.DataFrame(ch.groupby(ch.index).size(), columns = ["Number_of_Bookings"])
rh_num.head(5)
ch_num.head(5)
def visualize_time_series(df, col_name):
fig = px.line(df, x=df.index, y=col_name)
fig.update_xaxes(
rangeslider_visible=True,
rangeselector=dict(
buttons=list([
dict(count=1, label="1m", step="month", stepmode="backward"),
dict(count=6, label="6m", step="month", stepmode="backward"),
dict(count=1, label="YTD", step="year", stepmode="todate"),
dict(count=1, label="1y", step="year", stepmode="backward"),
dict(step="all")
])
)
)
fig.show()
The Ljung Box test is a way to test for the absence of serial autocorrelation, up to a specified lag k. The test determines whether or not errors are white noise or whether there is something more behind them.
If p-value less than significance level, then it is not pure white noise.
def check_for_white_noise(df):
autocorrelation_plot(df)
plt.show()
tsa.plot_acf(df, lags = 40, alpha = 0.05, title = 'Auto correlation plot')
a = diag.acorr_ljungbox(df, lags = [40], boxpierce = True)
print(a)
def decomposition(df):
# multiplicative_decomposition = seasonal_decompose(df["dp_Value"], model = "multiplicative", extrapolate_trend = "freq")
additive_decomposition = seasonal_decompose(df, model = "additive", extrapolate_trend = "freq", freq = 7)
plt.rcParams.update({'figure.figsize' : (16,12)})
# multiplicative_decomposition.plot().suptitle('Multiplicative Decomposition', fontsize = 16)
additive_decomposition.plot().suptitle("Additive Decomposition", fontsize = 16)
plt.show()
def plot_rolling_mean(df, col_name, window):
rolling_mean = df.rolling(window = window).mean()
rolling_std = df.rolling(window = window).std()
fig = go.Figure()
trace1 = go.Scatter(x = df.index, y = df[col_name], mode = "lines", name = col_name)
trace2 = go.Scatter(x = rolling_mean.index, y = rolling_mean[col_name], mode = "lines", name = "Rolling Mean")
trace3 = go.Scatter(x = rolling_std.index, y = rolling_std[col_name], mode = "lines", name = "Rolling Standard Deviation")
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.add_trace(trace3)
fig.update_layout(width = 1100)
fig.show()
def adf_test(df):
result = adfuller(df, autolag = "AIC")
dfoutput = pd.Series(result[0:4], index=['Test Statistic','p-value',' Lags Used','Number of Observations Used'])
print('The test statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
print('%s: %.3f' % (key, value))
if result[1]<0.05:
print("The p-value is less than 0.05, hence we can reject the null hypothesis and say that the time series is stationary")
else:
print("The series is not stationary")
def kpss_test(df):
result = kpss(df, regression='c')
print('\nKPSS Statistic: %f' % result[0])
print('p-value: %f' % result[1])
for key, value in result[3].items():
print('Critial Values:')
print(f' \t\t {key}: \t {value}')
print("Result: The series is {} stationary".format("not" if result[1]<0.05 else ""))
def get_train_test(df, split):
split = split
df_train = df[0:-split]
df_test = df[-split:]
return(df_train, df_test)
def forecast_plotly_data(self, training_data, prediction_intervals=(50.0, 90.0), show_mean=False, color="b", label=None, output_file=None, *args, **kwargs,):
fig = go.Figure()
label_prefix = "" if label is None else label + "-"
for c in prediction_intervals:
assert 0.0 <= c <= 100.0
ps = [50.0] + [
50.0 + f * c / 2.0
for c in prediction_intervals
for f in [-1.0, +1.0]
]
percentiles_sorted = sorted(set(ps))
def alpha_for_percentile(p):
return (p / 100.0) ** 1.5
ps_data = [self.quantile(p / 100.0) for p in percentiles_sorted]
i_p50 = len(percentiles_sorted) // 2
p50_data = ps_data[i_p50]
p50_series = pd.Series(data=p50_data, index=self.index)
data = to_pandas(training_data)[-365:]
trace1 = go.Scatter(
name = "Device Values",
x = data.index,
y = data,
mode = "lines",
line=dict(
color='darkgreen',
width=1
)
# line = dict(color = 'rgb(0,10,255)'),
)
fig.add_trace(trace1)
trace2 = go.Scatter(
name='Forecast Median',
x=p50_series.index,
y=p50_series,
mode='lines',
line=dict(
color='orange',
width=2
)
# line=dict(color='rgb(255, 0, 0)'),
)
fig.add_trace(trace2)
opacity_colour = ['rgba(250, 0, 0, 0.2)', 'rgba(250, 0, 0, 0.4)']
for i in range(len(percentiles_sorted) // 2):
ptile = percentiles_sorted[i]
alpha = alpha_for_percentile(ptile)
x = p50_series.index
y_lower = ps_data[i]
y_upper = ps_data[-i-1]
fig.add_trace(go.Scatter(
name=f"{label_prefix}{100 - ptile * 2}%",
x=x,
y=y_upper,
mode='lines',
marker=dict(color="#444"),
line=dict(width=0),
showlegend=False
)),
fig.add_trace(go.Scatter(
name=f"{label_prefix}{100 - ptile * 2}%",
x=x,
y=y_lower,
marker=dict(color="#444"),
line=dict(width=0),
mode='lines',
fillcolor=opacity_colour[i],
fill='tonexty',
showlegend=True
))
return(fig)
def get_evaluation_gluonts(test_data, predictor):
forecast_it, ts_it = make_evaluation_predictions(
dataset=test_data, # test dataset
predictor=predictor, # predictor
num_samples=100, # number of sample paths we want for evaluation
)
forecasts = list(forecast_it)
tss = list(ts_it)
evaluator = Evaluator()
agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=1)
return(agg_metrics, item_metrics)
visualize_time_series(rh_num, "Number_of_Bookings")
check_for_white_noise(rh_num["Number_of_Bookings"])
decomposition(rh_num["Number_of_Bookings"])
plot_rolling_mean(rh_num,"Number_of_Bookings",30)
adf_test(rh_num["Number_of_Bookings"])
pd.date_range(start = rh_num.index.values[0], end = rh_num.index.values[-1]).difference(rh_num.index)
rh_train, rh_test = get_train_test(rh_num, 60)
rh_training_data = ListDataset(
[{"start": rh_train.index[0], "target": rh_train["Number_of_Bookings"]}],
freq = "D"
)
rh_test_data = ListDataset(
[{'start' : rh_num.index[0], "target" : rh_num["Number_of_Bookings"]}],
freq = "D"
)
prediction_length = 60
estimator = DeepAREstimator(freq="D", prediction_length=prediction_length, trainer=Trainer(epochs=30, num_batches_per_epoch = 50, learning_rate = 1e-3,batch_size = 32))
predictor = estimator.train(training_data=rh_training_data)
rh_forecast = predictor.predict(rh_training_data)
rh_forecast_list = list(rh_forecast)
prediction_interval = [50.0,98.0]
fig = forecast_plotly_data(rh_forecast_list[0], list(rh_test_data)[0], prediction_intervals = prediction_interval)
fig.update_xaxes(
rangeslider_visible=True,
rangeselector=dict(
buttons=list([
dict(count=1, label="1m", step="month", stepmode="backward"),
dict(count=6, label="6m", step="month", stepmode="backward"),
dict(count=1, label="YTD", step="year", stepmode="todate"),
dict(count=1, label="1y", step="year", stepmode="backward"),
dict(count = 9, label = "9m", step = "month", stepmode = "backward"),
dict(step="all")
])
)
)
fig.update_layout(title = "Deep AR Forecasting for Number of Bookings - Resort Hotel", width = 1050)
fig.show()
fig.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "w") as f:
f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
forecast_it, ts_it = make_evaluation_predictions(
dataset=rh_test_data, # test dataset
predictor=predictor, # predictor
num_samples=100, # number of sample paths we want for evaluation
)
rh_forecasts = list(forecast_it)
rh_tss = list(ts_it)
evaluator = Evaluator()
agg_metrics, item_metrics = evaluator(iter(rh_tss), iter(rh_forecasts), num_series=1)
rh_metrics_dict = {}
metrics_list = ['MSE','RMSE', 'MASE','MAPE','sMAPE']
for metric in metrics_list:
rh_metrics_dict[metric] = agg_metrics.get(metric)
print(metric + ": ", rh_metrics_dict[metric])
def fit_auto_arima(df_train, df_test, m, hotel):
arima_model = auto_arima(df_train, start_p = 1, d = 0, start_q = 1, max_p = 3, max_d = 2, max_q = 3, start_P = 0, D = 1, start_Q = 0, m = m, seasonal = True, error_action = 'warn', trace = True, supress_warnings = True, stepwise = True)
arima_model.summary()
prediction = pd.DataFrame(arima_model.predict(n_periods = len(df_test), index = df_test.index))
prediction.columns = ['predicted_values']
plt.figure(figsize = (14,4))
plt.plot(df_train, label = "Training")
plt.plot(df_test, label = "Test")
plt.plot(df_test.index,prediction, label = "Predicted")
plt.title(f"Auto ARIMA for number of bookings for {hotel}")
plt.legend()
plt.show()
return(arima_model,prediction)
rh_train, rh_test = get_train_test(rh_num, 60)
arima_model, arima_pred = fit_auto_arima(rh_train, rh_test, 12, "resort")
def performance_metrics_auto_arima(y_true, y_pred):
print(f"The performance metrics are : ")
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
print("Mean squared error is ", round(mse,4))
print("Root mean squared error is ", round(rmse,4))
print("Mean absolute error is ", round(mae,4))
print("Mean absolute percentage error is ", round(mape*100,4))
performance_metrics_auto_arima(rh_test["Number_of_Bookings"], arima_pred.predicted_values)
rh_m = Prophet()
rh_prophet = rh_num.reset_index()
rh_prophet = rh_prophet.rename(columns = {"datetime" : "ds", "Number_of_Bookings" : "y"})
rh_prophet.head(5)
prh_train, prh_test = get_train_test(rh_prophet, 60)
rh_m.fit(prh_train)
date_list = pd.date_range(start = prh_test.ds.iloc[0], end = prh_test.ds.iloc[-1])
future = pd.DataFrame(date_list, columns = ["ds"])
forecast_rh_prophet = rh_m.predict(future)
def plot_forecast_prophet_algo(df_train, df_test, forecast, title):
fig = go.Figure()
fig.add_trace(go.Scatter(
name = "Number of Bookings",
x = df_train.ds[-100:],
y = df_train.y[-100:],
mode = "lines",
line=dict(
color='blue',
width=1
)))
fig.add_trace(go.Scatter(
name = "Number of Bookings",
x = df_test.ds,
y = df_test.y,
mode = "lines",
line=dict(
color='darkgreen',
width=1
)))
opacity_colour = ['rgba(250, 0, 0, 0.2)', 'rgba(250, 0, 0, 0.4)']
fig.add_trace(go.Scatter(
name = "Predicted",
x = forecast.ds,
y = forecast.yhat,
mode = "lines",
line=dict(
color='orange',
width=1
)))
fig.add_trace(go.Scatter(
name="Predicted_upper",
x=forecast.ds,
y=forecast.yhat_upper,
mode='lines',
marker=dict(color="#444"),
line=dict(width=0),
showlegend=False
))
fig.add_trace(go.Scatter(
name="Predicted_lower",
x=forecast.ds,
y=forecast.yhat_lower,
marker=dict(color="#444"),
line=dict(width=0),
mode='lines',
fillcolor=opacity_colour[0],
fill='tonexty',
showlegend=True
))
fig.update_layout(title = f"Prophet Forecasting for Number of Bookings in Future Dates - {title}", width = 1050)
fig.show()
return(fig)
fig_prophet_rh = plot_forecast_prophet_algo(prh_train, prh_test, forecast_rh_prophet, "Resort")
fig_prophet_rh.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "a") as f:
f.write(fig_prophet_rh.to_html(full_html = False, include_plotlyjs='cdn'))
def performance_metrics_prophet(y_true, y_pred):
print(f"The performance metrics are: ")
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
meae = median_absolute_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
accuracy_mape = (100-(mape*100))
print("Mean squared error is ", round(mse,4))
print("Mean absolute error is ", round(mae,4))
print("Root Mean Squared Error is ", round(rmse,4))
print("Mean absolute percentage error is ", round(mape*100,4))
y_true = prh_test.y.values
y_pred = forecast_rh_prophet.yhat.values
performance_metrics_prophet(y_true, y_pred)
visualize_time_series(ch_num, "Number_of_Bookings")
# check_for_white_noise(ch_num["Number_of_Bookings"])
decomposition(ch_num["Number_of_Bookings"])
plot_rolling_mean(ch_num,"Number_of_Bookings",30)
adf_test(ch_num["Number_of_Bookings"])
pd.date_range(start = ch_num.index.values[0], end = ch_num.index.values[-1]).difference(ch_num.index)
ch_train, ch_test = get_train_test(ch_num, 60)
ch_training_data = ListDataset(
[{"start": ch_train.index[0], "target": ch_train["Number_of_Bookings"]}],
freq = "D"
)
ch_test_data = ListDataset(
[{'start' : ch_num.index[0], "target" : ch_num["Number_of_Bookings"]}],
freq = "D"
)
prediction_length = 60
estimator_ch = DeepAREstimator(freq="D", prediction_length=prediction_length, trainer=Trainer(epochs=30, num_batches_per_epoch = 50, learning_rate = 1e-3,batch_size = 32))
predictor_ch = estimator_ch.train(training_data=ch_training_data)
ch_forecast = predictor.predict(ch_training_data)
ch_forecast_list = list(ch_forecast)
prediction_interval = [50.0,99.0]
fig = forecast_plotly_data(ch_forecast_list[0], list(ch_test_data)[0], prediction_intervals = prediction_interval)
fig.update_xaxes(
rangeslider_visible=True,
rangeselector=dict(
buttons=list([
dict(count=1, label="1m", step="month", stepmode="backward"),
dict(count=6, label="6m", step="month", stepmode="backward"),
dict(count=1, label="YTD", step="year", stepmode="todate"),
dict(count=1, label="1y", step="year", stepmode="backward"),
dict(count = 9, label = "9m", step = "month", stepmode = "backward"),
dict(step="all")
])
)
)
fig.update_layout(title = "Deep AR Forecasting for Number of Bookings - City Hotel", width = 1050)
fig.show()
fig.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "a") as f:
f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
forecast_it, ts_it = make_evaluation_predictions(
dataset=rh_test_data, # test dataset
predictor=predictor, # predictor
num_samples=100, # number of sample paths we want for evaluation
)
ch_forecasts = list(forecast_it)
ch_tss = list(ts_it)
evaluator = Evaluator()
agg_metrics, item_metrics = evaluator(iter(ch_tss), iter(ch_forecasts), num_series=1)
ch_metrics_dict = {}
metrics_list = ['MSE','RMSE', 'MASE','MAPE','sMAPE']
for metric in metrics_list:
rh_metrics_dict[metric] = agg_metrics.get(metric)
print(metric + ": ", rh_metrics_dict[metric])
def fit_auto_arima(df_train, df_test, m, hotel):
arima_model = auto_arima(df_train, start_p = 1, d = 0, start_q = 1, max_p = 3, max_d = 2, max_q = 3, start_P = 0, D = 1, start_Q = 0, m = m, seasonal = True, error_action = 'warn', trace = True, supress_warnings = True, stepwise = True)
arima_model.summary()
prediction = pd.DataFrame(arima_model.predict(n_periods = len(df_test), index = df_test.index))
prediction.columns = ['predicted_values']
plt.figure(figsize = (14,4))
plt.plot(df_train, label = "Training")
plt.plot(df_test, label = "Test")
plt.plot(df_test.index,prediction, label = "Predicted")
plt.title(f"Auto ARIMA for number of bookings for {hotel}")
plt.legend()
plt.show()
return(arima_model,prediction)
ch_train, ch_test = get_train_test(ch_num, 60)
arima_model_ch, arima_pred_ch = fit_auto_arima(ch_train, ch_test, 12, "City")
def performance_metrics_auto_arima(y_true, y_pred):
print(f"The performance metrics are : ")
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
print("Mean squared error is ", round(mse,4))
print("Root mean squared error is ", round(rmse,4))
print("Mean absolute error is ", round(mae,4))
print("Mean absolute percentage error is ", round(mape*100,4))
performance_metrics_auto_arima(ch_test["Number_of_Bookings"], arima_pred_ch.predicted_values)
ch_m = Prophet()
ch_prophet = ch_num.reset_index()
ch_prophet = ch_prophet.rename(columns = {"datetime" : "ds", "Number_of_Bookings" : "y"})
ch_prophet.head(5)
pch_train, pch_test = get_train_test(ch_prophet, 60)
ch_m.fit(pch_train)
date_list = pd.date_range(start = pch_test.ds.iloc[0], end = pch_test.ds.iloc[-1])
future = pd.DataFrame(date_list, columns = ["ds"])
forecast_ch_prophet = ch_m.predict(future)
fig_prophet_ch = plot_forecast_prophet_algo(pch_train, pch_test, forecast_ch_prophet, "City")
fig_prophet_ch.update_layout(width = 1500)
with open("Forecasting_Graphs.html", "a") as f:
f.write(fig_prophet_ch.to_html(full_html = False, include_plotlyjs='cdn'))
y_true = pch_test.y.values
y_pred = forecast_ch_prophet.yhat.values
performance_metrics_prophet(y_true, y_pred)